Versions

V19 - updated to survey_data.2020-07-08_17.17.10.txt

V18 - updated to survey_data.2020-07-02_11.01.36.txt

V17 - Change sample to dataset; change project to cohort

V16 - use namer on chunks

V15 - change correlation plots to small dots

V14 - combine plots

V13 - change gene count threshold from categporical to numeric

V12 - exclude datasets with fewer than 100 measured genes and use pre-generated survey_data

V11 - color consistency

V10 - calculate non genic and all exonic counts for correlations analysis

V9 - add gene count; import readdist.txt for more nuanced comparison of cor with gene count

Overview

reference range for unmapped reads is calculated based on unmapped reads/total

reference range for duplicate reads is calculated based on duplicate reads/mapped reads where mapped reads = total mapped and multi-mapped (<20x) reads)

reference range for non-exonic reads is calculated based on non-exonic/non duplicate reads) where non duplicate reads = exonic and non-exonic non dupe reads)

The composition fractions are not dependent on higher level ones

Libraries

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(viridis)
## Loading required package: viridisLite
library(knitr)
library(ggthemes) # for theme_linedraw() 
library(ggpubr) # for stat_cor
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(cowplot) # for ggdraw and draw_label
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggpubr':
## 
##     get_legend
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library(RColorBrewer)
library(pander)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Config

inputs_dir <- "inputs"
data_file <- "survey_data.2020-07-02_11.50.01.txt"

Settings

min_genes_gt_0 <- 100

Functions

is_outlier <- function(x) {
  (x > quantile(x, 0.75) + 1.5*IQR(x)) |
    (x < quantile(x, 0.25) - 1.5*IQR(x))
}


# StatBin2 allows depiction of empty bins as blank instead of a horizontal line:
# https://stackoverflow.com/questions/57128090/remove-baseline-color-for-geom-histogram
StatBin2 <- ggproto(
  "StatBin2", 
  StatBin,
  compute_group = function (data, scales, binwidth = NULL, bins = NULL, 
                            center = NULL, boundary = NULL, 
                            closed = c("right", "left"), pad = FALSE, 
                            breaks = NULL, origin = NULL, right = NULL, 
                            drop = NULL, width = NULL) {
    if (!is.null(breaks)) {
      if (!scales$x$is_discrete()) {
        breaks <- scales$x$transform(breaks)
      }
      bins <- ggplot2:::bin_breaks(breaks, closed)
    }
    else if (!is.null(binwidth)) {
      if (is.function(binwidth)) {
        binwidth <- binwidth(data$x)
      }
      bins <- ggplot2:::bin_breaks_width(scales$x$dimension(), binwidth, 
                                         center = center, boundary = boundary, 
                                         closed = closed)
    }
    else {
      bins <- ggplot2:::bin_breaks_bins(scales$x$dimension(), bins, 
                                        center = center, boundary = boundary, 
                                        closed = closed)
    }
    res <- ggplot2:::bin_vector(data$x, bins, weight = data$weight, pad = pad)

    # drop 0-count bins completely before returning the dataframe
    res <- res[res$count > 0, ] 

    res
  })

Import data

counts_and_meta <- read_tsv(file.path(inputs_dir, data_file)) %>%
  group_by(cohort) %>%
  mutate(n_in_cohort = n())
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   dataset_id = col_character(),
##   disease = col_character(),
##   dataset_prefix = col_character(),
##   pedaya = col_character(),
##   gender = col_character(),
##   doi_link = col_character(),
##   source_accession = col_character(),
##   repo = col_character(),
##   original_dataset_id = col_character(),
##   cohort = col_character()
## )
## See spec(...) for full column specifications.

Colors

# brewer.pal(8, "Paired")

these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", 
"#FDBF6F", "#FF7F00")

# light blue, dark blue, etc: green, red, orange (then purple and brown)

these_colors <- c("#1F78B4", "#1F78B4", "#33A02C", "#33A02C", "#E31A1C", "#E31A1C", 
"#FF7F00", "#FF7F00")



names(these_colors) <-  c("not assigned", "Total reads", "Non-exonic reads",  "Exonic reads", "Duplicate reads",  "Non-duplicate reads",  "Unmapped reads", "Mapped reads")

these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")

Datasets analyzed

nrow(counts_and_meta)
## [1] 2178

Diseases in dataset

counts_and_meta %>% tabyl(disease)
##                                     disease   n      percent
##                acute lymphoblastic leukemia 680 0.3122130395
##             acute megakaryoblastic leukemia 103 0.0472910927
##                      acute myeloid leukemia 221 0.1014692378
##             acute undifferentiated leukemia   1 0.0004591368
##                      adrenocortical adenoma   1 0.0004591368
##                       adrenocortical cancer   2 0.0009182736
##                    adrenocortical carcinoma  19 0.0087235996
##                   alveolar rhabdomyosarcoma  40 0.0183654729
##                          cholangiocarcinoma   1 0.0004591368
##                    choroid plexus carcinoma  25 0.0114784206
##         desmoplastic small round cell tumor   4 0.0018365473
##      dysembryoplastic neuroepithelial tumor  13 0.0059687787
##                  embryonal rhabdomyosarcoma  42 0.0192837466
##  embryonal tumor with multilayered rosettes   1 0.0004591368
##                                  ependymoma  98 0.0449954086
##                         epithelioid sarcoma   1 0.0004591368
##                               Ewing sarcoma  70 0.0321395776
##      fibrolamellar hepatocellular carcinoma   3 0.0013774105
##                                fibromatosis   1 0.0004591368
##              gastrointestinal stromal tumor   4 0.0018365473
##                             germ cell tumor   1 0.0004591368
##                     glioblastoma multiforme  29 0.0133149679
##                                      glioma 193 0.0886134068
##                              hepatoblastoma   1 0.0004591368
##                    hepatocellular carcinoma   3 0.0013774105
##                 hypereosinophillic syndrome   1 0.0004591368
##            juvenile myelomonocytic leukemia   1 0.0004591368
##                                    leukemia   1 0.0004591368
##                                    lymphoma  49 0.0224977043
##                             medulloblastoma 201 0.0922865014
##                                    melanoma   7 0.0032139578
##             melanotic neuroectodermal tumor   1 0.0004591368
##                                  meningioma   1 0.0004591368
##                             myofibromatosis   2 0.0009182736
##                    nasopharyngeal carcinoma   2 0.0009182736
##                               neuroblastoma  12 0.0055096419
##                    neuroendocrine carcinoma   1 0.0004591368
##                                not reported   2 0.0009182736
##                not reported - QC fail by PI   1 0.0004591368
##                                osteosarcoma 157 0.0720844812
##           ovarian serous cystadenocarcinoma   1 0.0004591368
##                                      PEComa   1 0.0004591368
##                    pleuropulmonary blastoma   1 0.0004591368
##                              retinoblastoma   2 0.0009182736
##                              rhabdoid tumor  65 0.0298438935
##                            rhabdomyosarcoma  53 0.0243342516
##    spindle cell/sclerosing rhabdomyosarcoma   3 0.0013774105
##          supratentorial embryonal tumor NOS  18 0.0082644628
##                            synovial sarcoma  22 0.0101010101
##                undifferentiated sarcoma NOS   3 0.0013774105
##       undifferentiated spindle cell sarcoma   2 0.0009182736
##                                 wilms tumor  11 0.0050505051
disease_freq_table <- counts_and_meta$disease %>%
  str_to_sentence %>%
  str_replace(" nos$", " NOS") %>%
  as.factor %>%
  fct_infreq %>%
  fct_lump(prop=0.01) %>%
  tabyl (var1 = "Disease") %>%
  adorn_pct_formatting(digits = 1)

colnames(disease_freq_table)[1]="Disease"

write_tsv(disease_freq_table, "results/disease_freq_table.txt")

disease_freq_table  %>%
  kable %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
Disease n percent
Acute lymphoblastic leukemia 680 31.2%
Acute myeloid leukemia 221 10.1%
Medulloblastoma 201 9.2%
Glioma 193 8.9%
Osteosarcoma 157 7.2%
Acute megakaryoblastic leukemia 103 4.7%
Ependymoma 98 4.5%
Ewing sarcoma 70 3.2%
Rhabdoid tumor 65 3.0%
Rhabdomyosarcoma 53 2.4%
Lymphoma 49 2.2%
Embryonal rhabdomyosarcoma 42 1.9%
Alveolar rhabdomyosarcoma 40 1.8%
Glioblastoma multiforme 29 1.3%
Choroid plexus carcinoma 25 1.1%
Synovial sarcoma 22 1.0%
Other 130 6.0%
# many are leukemias
table(str_detect(counts_and_meta$disease, "leukemia"))
## 
## FALSE  TRUE 
##  1171  1007

Cohorts in dataset

library(ggthemes)

old_cohort_size_histogram <- ggplot(counts_and_meta) + 
  geom_histogram(aes(n_in_cohort, group=cohort), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
  # theme_linedraw(base_size = 20) +
  theme_minimal(base_size = 20) +
  theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +  xlab("Cohort size") +
  ylab("Cohorts") + 
  theme(legend.position="none")

cohort_size_histogram <- ggplot(counts_and_meta %>%
  select(cohort, n_in_cohort) %>%
  distinct ) + 
  geom_histogram(aes(n_in_cohort), 
                 fill = "lightgrey", color = "black", stat = StatBin2,
                 # binwidth = 10,
                 breaks = seq(0,500, by=10)) +
  # theme_linedraw(base_size = 20) +
  theme_minimal(base_size = 20) +
  theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +  
  xlab("Cohort size") +
  scale_y_continuous("Cohorts", breaks = seq(0, 8, 2)) + 
  theme(legend.position="none")

cohort_size_histogram

# for cohorts that are more than 75% one disease, color by that disease

n_distinct(counts_and_meta$cohort)
## [1] 38
sort(unique(counts_and_meta$n_in_cohort))
##  [1]   3   4   6   7  10  14  15  17  19  22  23  24  25  31  35  39  41
## [18]  52  62  63  65  73  75  84  88  89 127 137 337 406
counts_and_meta %>%
  select(cohort, n_in_cohort) %>%
  distinct %>%
  arrange(desc(n_in_cohort)) %>%
  kable
cohort n_in_cohort
St Jude Cloud NOS 406
TARGET-10 337
TARGET-20 137
EGAD00001003279 127
phs000673.v2.p1 89
TARGET-40 88
phs000720.v2.p1 84
10.1038/ng.2938 75
10.1038/nature13109 73
TARGET-52 65
EGAD00001001098 63
10.1056/NEJMoa1403088 63
phs000768.v2.p1 62
10.1038/ng.2611 52
10.1016/j.ccr.2013.11.002 41
EGAD00001001620 39
phs000699.v1.p1 35
10.24370/SD_BHJXBDQK 31
EGAD00001001666 25
phs000900.v1.p1 24
EGAD00001000648 24
TARGET-21 23
EGAD00001000356 23
EGAD00001000617 23
10.1016/j.celrep.2014.03.003 23
EGAD00001001927 22
EGAD00001002680 19
SRP126664 19
EGAD00001000158 17
SRP092501 15
10.1016/j.ccr.2012.10.007 14
EGAD00001000826 10
TARGET-30 7
EGAD00001000328 6
10.1038/ng.3230 6
TARGET-50 4
SRP006575 4
SRP040454 3
counts_and_meta %>%
  select(cohort, n_in_cohort) %>%
  distinct %>%
  pull(n_in_cohort) %>%
  median
## [1] 24.5

Ages in dataset

table(counts_and_meta$pedaya, useNA = "always")
## 
##                  No             Unknown Yes, age < 30 years 
##                  51                   3                1557 
##                <NA> 
##                 567
table(subset(counts_and_meta, is.na(pedaya))$dataset_prefix)
## 
## TARGET  THR39 
##    548     19
n_ped_aya <- sum(counts_and_meta$pedaya=="Yes, age < 30 years" | grepl("TARGET", counts_and_meta$dataset_prefix), na.rm = TRUE)
n_ped_aya
## [1] 2105
n_ped_aya/nrow(counts_and_meta)
## [1] 0.966483
# more than 95% of which are pediatric <30; of the Datasets with specific ages at diagnosis reported, the median was 7

median(counts_and_meta$age_at_dx, na.rm = TRUE)
## [1] 8.05
table(is.na(counts_and_meta$age_at_dx))
## 
## FALSE  TRUE 
##  1250   928

Read lengths in dataset

seq_len_round_25 <- round(counts_and_meta$seq_length / 25) * 25
table(seq_len_round_25, useNA = "always")
## seq_len_round_25
##   25   50   75  100  125 <NA> 
##    4  238  306 1358   27  245
tabyl(seq_len_round_25)
##  seq_len_round_25    n     percent valid_percent
##                25    4 0.001836547   0.002069322
##                50  238 0.109274564   0.123124677
##                75  306 0.140495868   0.158303156
##               100 1358 0.623507805   0.702534920
##               125   27 0.012396694   0.013967926
##                NA  245 0.112488522            NA
table(is.na(counts_and_meta$seq_length))
## 
## FALSE  TRUE 
##  1933   245
table(counts_and_meta$seq_length)
## 
##   36   50   51   75   76  100  101  110  125 
##    4   11  227  266   40   88 1255   15   27
summary(counts_and_meta$seq_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   36.00   75.00  101.00   90.97  101.00  125.00     245
IQR(counts_and_meta$seq_length, na.rm = TRUE)
## [1] 26
 old_read_length_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(seq_length, group=cohort), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
#  theme_linedraw(base_size = 20) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Sequence length") +
    ylab("Datasets")
 
 read_length_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(seq_length), 
                 fill = "lightgrey", color = "black", stat = StatBin2,
                 binwidth = 1) +
#  theme_linedraw(base_size = 20) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Sequence length") +
    ylab("Datasets") +
   expand_limits(x=0)
  
read_length_histogram
## Warning: Removed 245 rows containing non-finite values (stat_bin2).

subset(counts_and_meta, Mapped > Total_reads)
## # A tibble: 0 x 23
## # Groups:   cohort [0]
## # … with 23 variables: dataset_id <chr>, MND <dbl>, MEND <dbl>,
## #   Total_reads <dbl>, Mapped <dbl>, seq_length <dbl>, disease <chr>,
## #   dataset_prefix <chr>, age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## #   doi_link <chr>, source_accession <chr>, repo <chr>,
## #   original_dataset_id <chr>, genes_expressed_above_0 <dbl>,
## #   genes_expressed_above_1 <dbl>, genes_expressed_above_2 <dbl>,
## #   genes_expressed_above_3 <dbl>, genes_expressed_above_4 <dbl>,
## #   genes_expressed_above_5 <dbl>, cohort <chr>, n_in_cohort <int>
subset(counts_and_meta, MND > Mapped)
## # A tibble: 0 x 23
## # Groups:   cohort [0]
## # … with 23 variables: dataset_id <chr>, MND <dbl>, MEND <dbl>,
## #   Total_reads <dbl>, Mapped <dbl>, seq_length <dbl>, disease <chr>,
## #   dataset_prefix <chr>, age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## #   doi_link <chr>, source_accession <chr>, repo <chr>,
## #   original_dataset_id <chr>, genes_expressed_above_0 <dbl>,
## #   genes_expressed_above_1 <dbl>, genes_expressed_above_2 <dbl>,
## #   genes_expressed_above_3 <dbl>, genes_expressed_above_4 <dbl>,
## #   genes_expressed_above_5 <dbl>, cohort <chr>, n_in_cohort <int>
subset(counts_and_meta, MEND > MND)
## # A tibble: 0 x 23
## # Groups:   cohort [0]
## # … with 23 variables: dataset_id <chr>, MND <dbl>, MEND <dbl>,
## #   Total_reads <dbl>, Mapped <dbl>, seq_length <dbl>, disease <chr>,
## #   dataset_prefix <chr>, age_at_dx <dbl>, pedaya <chr>, gender <chr>,
## #   doi_link <chr>, source_accession <chr>, repo <chr>,
## #   original_dataset_id <chr>, genes_expressed_above_0 <dbl>,
## #   genes_expressed_above_1 <dbl>, genes_expressed_above_2 <dbl>,
## #   genes_expressed_above_3 <dbl>, genes_expressed_above_4 <dbl>,
## #   genes_expressed_above_5 <dbl>, cohort <chr>, n_in_cohort <int>

Summarize total read counts

min(counts_and_meta$Total_reads)
## [1] 206916
Total_read_counts <- counts_and_meta %>% 
  ungroup %>%
  mutate(dataset_value = Total_reads/1e6) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(Total_read_counts %>%
        select(-variance, -p25, -p75), digits = 1)
min max median iqr
0.2 668 61.1 53.3
total_read_count_histogram <-  ggplot(counts_and_meta) + 
  geom_histogram(aes(Total_reads/1E6), 
                 fill = "lightgrey", color = "black", stat = StatBin2) +
#  theme_linedraw(base_size = 20) +
      theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  xlab("Total reads") +
    ylab("Datasets")

total_read_count_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Calculate percentages

 read_counts_with_read_type_fractions <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(
    pct_unmapped_of_total = (Total_reads - Mapped) / Total_reads,
    pct_dupe_of_mapped = (Mapped - MND) / Mapped,
    pct_non_exonic_of_non_dupe = (MND - MEND) / MND
  )

read_type_fractions_long <- read_counts_with_read_type_fractions %>%
#  select(c(dataset_id, cohort_id, cohort, starts_with("pct_"))) %>%
  pivot_longer(cols = starts_with("pct_"), names_to = "read_type_fraction_name", values_to = "dataset_value") 

Summarize read type fractions

comparison_of_distributions <- read_type_fractions_long %>% 
  group_by(read_type_fraction_name) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(comparison_of_distributions %>%
      mutate_if(is.double, ~100*.), digits = 3)
read_type_fraction_name variance min max median p25 p75 iqr
pct_dupe_of_mapped 5.886 2.853 99.997 26.895 13.449 43.053 29.604
pct_non_exonic_of_non_dupe 4.094 4.495 97.000 24.769 16.197 37.277 21.080
pct_unmapped_of_total 0.606 0.710 76.677 3.414 2.740 6.009 3.269
kable(comparison_of_distributions %>%
        select(-variance, -p25, -p75) %>%
      mutate_if(is.double, ~100*.), digits = 0)
read_type_fraction_name min max median iqr
pct_dupe_of_mapped 3 100 27 30
pct_non_exonic_of_non_dupe 4 97 25 21
pct_unmapped_of_total 1 77 3 3

statements about read type fractions

unmapped

# in 77 datasets, more than 25% of reads are unmapped
table(read_counts_with_read_type_fractions$pct_unmapped_of_total>0.25)
## 
## FALSE  TRUE 
##  2101    77

duplicate

# in 77 datasets, more than 25% of reads are unmapped
read_counts_with_read_type_fractions %>%
  mutate(above_50 = pct_dupe_of_mapped>0.50) %>%
  tabyl(above_50)
##  above_50    n   percent
##     FALSE 1752 0.8044077
##      TRUE  426 0.1955923
fiftypct <- read_counts_with_read_type_fractions %>%
  group_by(cohort) %>%
  summarize(has_a_dataset_with_more_than_50pct_dupes = any(pct_dupe_of_mapped>0.50),
            median_pct_dupe_of_mapped = median(pct_dupe_of_mapped)) %>%
  mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
         median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_dupes)


fiftypct %>% tabyl(median_lt_50)
##  median_lt_50  n   percent
##         FALSE  7 0.1842105
##          TRUE 31 0.8157895
fiftypct %>% tabyl(has_a_dataset_with_more_than_50pct_dupes)
##  has_a_dataset_with_more_than_50pct_dupes  n   percent
##                                     FALSE  8 0.2105263
##                                      TRUE 30 0.7894737
fiftypct %>% tabyl(median_lt_50_and_has_a_dataset)
##  median_lt_50_and_has_a_dataset  n   percent
##                           FALSE 15 0.3947368
##                            TRUE 23 0.6052632

exonic

read_counts_with_read_type_fractions %>%
  mutate(above_50 = pct_non_exonic_of_non_dupe>0.50) %>%
  tabyl(above_50)
##  above_50    n   percent
##     FALSE 1848 0.8484848
##      TRUE  330 0.1515152
fiftypcte <- read_counts_with_read_type_fractions %>%
  group_by(cohort) %>%
  summarize(has_a_dataset_with_more_than_50pct_ne = any(pct_non_exonic_of_non_dupe>0.50),
            median_pct_dupe_of_mapped = median(pct_non_exonic_of_non_dupe)) %>%
  mutate(median_lt_50 = median_pct_dupe_of_mapped < 0.5,
         median_lt_50_and_has_a_dataset = median_lt_50 & has_a_dataset_with_more_than_50pct_ne)


fiftypcte %>% tabyl(median_lt_50)
##  median_lt_50  n    percent
##         FALSE  3 0.07894737
##          TRUE 35 0.92105263
fiftypcte %>% tabyl(has_a_dataset_with_more_than_50pct_ne)
##  has_a_dataset_with_more_than_50pct_ne  n   percent
##                                  FALSE 26 0.6842105
##                                   TRUE 12 0.3157895
fiftypcte %>% tabyl(median_lt_50_and_has_a_dataset)
##  median_lt_50_and_has_a_dataset  n   percent
##                           FALSE 29 0.7631579
##                            TRUE  9 0.2368421

Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?

cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads, 
    read_counts_with_read_type_fractions$pct_dupe_of_mapped,
    method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34

more exploration

# p <- ggplot(read_counts_with_read_type_fractions, aes(x=Total_reads/1e6, y=pct_dupe_of_mapped)) + 
#   geom_point() + 
#   geom_smooth(method=lm, se=FALSE)
# 
# # Add correlation coefficient
# p + stat_cor(method = "pearson") #, label.x = 250 * 1e6, label.y = 0.2)
# 0.64^2   

this_data <- read_counts_with_read_type_fractions %>%
  select(Total_reads, pct_dupe_of_mapped)
## Adding missing grouping variables: `cohort`
this_plot_title <- paste("Correlation of Total_reads and % Duplicates")

these_stats <- this_data  %>%
  ungroup %>%
  summarize(r_corr = round(cor(Total_reads, pct_dupe_of_mapped), 2))

ggplot(this_data,
       aes(x=Total_reads/1E6, y=pct_dupe_of_mapped)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black", se = FALSE) +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Total_reads/1E6),
             y=max(this_data$pct_dupe_of_mapped),
             hjust = 1, vjust = 1
             ) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  ggtitle(this_plot_title)  + 
  theme(legend.position="none", plot.title=element_text(size=22)) +
  xlab("Read counts (million)") +
  ylab("% Duplicates")
## `geom_smooth()` using formula 'y ~ x'

Generate schematic defining read types

colors_for_read_types <- these_colors_with_MEND
names(colors_for_read_types) <- gsub(" reads", "", names(these_colors_with_MEND))

colors_for_read_types <- c(colors_for_read_types, 
                           "Genome" = "darkblue",
                           "Exon" = "darkblue",
                           "Aligned reads" = "darkgrey")

plot1_colnames <- c("label", "x1", "x2", "y1", "y2")

read_type_schematic_data_as_vector <- c("Duplicate", 3, 4, 5, 6, 
                                     #"Duplicate", 3, 4, 6, 7,
                       "MEND", 3, 4, 4, 5, 
                       "MEND", 2.5, 3.5, 3, 4, 
                       "Non-exonic", 4.5, 5.5, 3, 4,
                       "Unmapped", 6, 7, 3, 4,
                       "Exon", 2.5, 4, 1, 2,
           #            "Genome", 4.5, 5.5, 0.5, 1.5,
                       "Aligned reads", 2.8, 2.8, 6.5, 7)

rows_in_schematic <- length(read_type_schematic_data_as_vector)/length(plot1_colnames)

read_type_schematic_data <- matrix(read_type_schematic_data_as_vector, 
                     ncol = length(plot1_colnames), byrow = TRUE,
                     dimnames = list(c(1:rows_in_schematic), plot1_colnames)) %>%
  as_tibble %>%
  type_convert() %>%
  mutate(
    base_color = colors_for_read_types[match(label, names(colors_for_read_types))],
    border_color = case_when(
      label %in% c("Aligned reads", "Genome", "MEND", "Exon") ~ NA_character_,
      TRUE ~ base_color),
    fill_color = ifelse(label %in% c("MEND", "Exon"),  base_color, "white"),
    text_color = case_when(
      label %in% c("MEND", "Exon")~  "white",
      TRUE ~ base_color),
    mid_bar_x = map2_dbl(x1, x2,  function(x, y) mean(c(x,y))),
    mid_bar_y = map2_dbl(y1, y2,  function(x, y) mean(c(x,y)))
  )
## Parsed with column specification:
## cols(
##   label = col_character(),
##   x1 = col_double(),
##   x2 = col_double(),
##   y1 = col_double(),
##   y2 = col_double()
## )
padding_size = 0.35
end_of_box <- sort(read_type_schematic_data$x2,partial=length(read_type_schematic_data$x2)-1)[length(read_type_schematic_data$x2)-1] + padding_size
read_type_schematic <- ggplot(read_type_schematic_data, 
                              aes(xmin=x1, xmax=x2, ymin=y1, ymax = y2, 
                                  fill = fill_color, color = border_color)) +
  geom_rect() +
  
  geom_segment(x=min(read_type_schematic_data$x1-padding_size), y=1.5, xend = 5.75, yend = 1.5, color = "darkblue") +   
  geom_text(aes(label = label, x = mid_bar_x, y=mid_bar_y,
                color = text_color), 
            size = 4,
            fontface = "bold") +
  geom_rect(aes(xmin = min(x1 - padding_size), 
                ymin = 2.5, 
                xmax = end_of_box,
                ymax=max(y2 + padding_size)), 
            fill = NA, color = "darkgrey", size = 0.25) +
  scale_fill_identity() +
  scale_color_identity() +
  theme_void() + 
  theme(
    panel.grid.major.x = element_line(color="lightgrey", size=0.2),
    #panel.grid.minor.x = element_line(color="lightgrey", size=0.2)
    ) + 
 scale_x_continuous(
   breaks = seq(min(read_type_schematic_data$x1) - padding_size + 0.25, end_of_box , 0.25))
#   minor_breaks = seq(4, 10, 0.25))

read_type_schematic

Generate schematic defining read type fractions

stat_names <- c("pct_unmapped_of_total",  "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
# complement_stat_names <- c("pct_mapped_of_total",  "pct_nondupe_of_mapped", "pct_exonic_of_non_dupe") 
stat_labels <- c("Unmapped",  "Duplicate", "Non-exonic") 
complement_stat_names <- c("Mapped reads",  "Non-duplicate reads", "Exonic reads") 
divisor_name = c("total", "mapped", "non-duplicate")
# names(divisor_name) = 

comparison_of_remaining <- read_type_fractions_long %>% 
  mutate(read_type_fraction_name = complement_stat_names[match(read_type_fraction_name, stat_names)]) %>%
  group_by(read_type_fraction_name) %>%
  summarize(min = round(1-max(dataset_value), 2),
    max = round(1-min(dataset_value), 2),
    median = round(1-median(dataset_value), 2),
    p25 = round(1-quantile(dataset_value, 0.25), 2),
    p75 = round(1-quantile(dataset_value, 0.75), 2)) %>%
      mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; x͂=",100* median, "%)"))
#  mutate(bar_label = paste0 (read_type_fraction_name, "\n(", 100*min, "-", 100*max, "% of ", divisor_name[match(read_type_fraction_name, complement_stat_names)], "; x=", 100*median, "%)"))

positive_bars <- comparison_of_remaining %>%
  select(-min, -max, -p25, -p75) %>%
  mutate(read_type_fraction_label = read_type_fraction_name, 
         position = "1") %>%
  rename(abs_median = median)

negative_bars <- tibble(read_type_fraction_name = positive_bars$read_type_fraction_name,
                        read_type_fraction_label = stat_labels[match(read_type_fraction_name, complement_stat_names)],
                        bar_label = read_type_fraction_label,
                        abs_median = 1-positive_bars$abs_median,
                        position = "2")

total_bar <- tibble(read_type_fraction_name = "Total reads",
                    read_type_fraction_label = read_type_fraction_name,
                    bar_label = "Total reads\n(100% of reads)",
                    abs_median = 1,
                    position = "1",
                    #category_space = 1
                    )


MEND_stats_of_total <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(
    pct_MEND_of_total = (Total_reads - MEND) / Total_reads
  ) %>%
  ungroup %>% 
  summarize(min = round(1-max(pct_MEND_of_total), 2),
    max = round(1-min(pct_MEND_of_total), 2),
    median = round(1-median(pct_MEND_of_total), 2))

MEND_bar <- tibble(read_type_fraction_name = "MEND reads",
                    read_type_fraction_label = read_type_fraction_name,
                    bar_label = with(MEND_stats_of_total, paste0 ("MEND reads\n(", 100*min, "-", 100*max, "% of total reads; x͂=",100* median, "%)")),
                    abs_median = 1,
                    position = "1",
                    #category_space = 1
                    )

bar_names_in_order <- c("Total reads", "Mapped reads", "Non-duplicate reads", "Exonic reads", "MEND reads")
  
all_bars <- bind_rows(positive_bars, negative_bars, total_bar, MEND_bar) %>%
  mutate(read_type_fraction_name = factor(read_type_fraction_name, 
                                          levels = bar_names_in_order))

cat_space <- all_bars %>% filter(position == 1) %>%
  arrange(read_type_fraction_name) %>%
  select(read_type_fraction_name, abs_median) %>%
  ungroup %>%
  mutate(lagged1_abs_median = lag(abs_median, default =1),
         lagged2_abs_median = lag(abs_median, n=2, default =1),
         category_space = lagged1_abs_median*lagged2_abs_median
         ) %>%
  select(read_type_fraction_name, category_space)

these_colors_with_MEND = c(these_colors, "MEND reads"="#000000")


all_bars_rel <- left_join(all_bars, cat_space, by = "read_type_fraction_name") %>%
  mutate(rel_median=ifelse(read_type_fraction_name == "MEND reads",
                           abs_median[read_type_fraction_label=="Exonic reads"]*
                             category_space[read_type_fraction_label=="Exonic reads"], 
                           abs_median*category_space),
         read_type_fraction_name = factor(read_type_fraction_name, levels = rev(bar_names_in_order)),
         position = factor(position, levels = rev(sort(unique(position)))),
         bar_color = these_colors_with_MEND[match(read_type_fraction_name, names(these_colors_with_MEND))],
         fill_val = ifelse(position == 1, bar_color, NA),
#         border_color_val = ifelse(position ==1, NA, bar_color),
         border_color_val = bar_color,
         text_color = ifelse(position ==1, "white", "black"),
font_size = ifelse(abs_median<0.1, 3,3.5),
text_angle = ifelse(abs_median<0.1, 90,0)
         ) 
read_type_fractions_schematic <- ggplot(all_bars_rel, aes(y=read_type_fraction_name,
                x=rel_median, group = position)) + 
   geom_col(aes(fill = fill_val,
                color = border_color_val
                )) +
   geom_text(aes(label=bar_label,
                 color = text_color,
                 size = font_size,
                 angle= text_angle), 
             position = position_stack(vjust = .5),
#             size = 4.5,
             #size = 3.5,
              fontface = "bold") +
   scale_fill_identity() +
   scale_color_identity()  +
  scale_size_identity() + 
   theme_void() +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) 


ggplot(all_bars_rel, aes(y=read_type_fraction_name,
                x=rel_median, group = position)) + 
   geom_col(aes(fill = fill_val,
                color = border_color_val
                )) +
   geom_text(aes(label=read_type_fraction_label,
                 color = text_color,
                 size = font_size,
                 angle = text_angle), 
             position = position_stack(vjust = .5),
             #size = 4.5,
              fontface = "bold") +
   scale_fill_identity() +
   scale_color_identity()  +
  scale_size_identity() + 
   theme_void() +
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())

read_type_fractions_schematic

figure_name <- "f1"

# plot_grid(cohort_size_histogram,
# read_type_schematic,
# read_length_histogram,
# read_type_fractions_schematic,
#                  nrow=2,
#                   labels = c("A", "C", "B", "D")) +
#   draw_label(paste(figure_name, Sys.time()),
#              x = 0, y = 0.02, hjust = 0, size = 6) 

right_side <- plot_grid(read_type_schematic,
read_type_fractions_schematic,
nrow=2,
rel_heights = c(1, 2),
labels = c("C", "D"))

left_side <- plot_grid(cohort_size_histogram,
read_length_histogram,
nrow=2,
labels = c("A", "B"))
## Warning: Removed 245 rows containing non-finite values (stat_bin2).
plot_grid(left_side,
right_side,
ncol = 2) +
   draw_label(paste(figure_name, Sys.time()),
              x = 0, y = 0.02, hjust = 0, size = 6) 

stat_names <- c("pct_unmapped_of_total",  "pct_dupe_of_mapped", "pct_non_exonic_of_non_dupe")
stat_label <- c("% unmapped", "% duplicate \n(of mapped)", "% non-exonic \n(of non-duplicate)")
names(stat_label) <- stat_names

Plot read fractions

colors_for_stat_names <- these_colors[c("Mapped reads", "Non-duplicate reads","Exonic reads")]
names(colors_for_stat_names) <- stat_names

these_colors <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", 
"#FDBF6F", "#FF7F00")
# light blue, dark blue, etc: green, red, orange (then purple and brown)
names(these_colors) <-  c("not assigned", "Total reads", "Non-exonic reads",  "Exonic reads", "Duplicate reads",  "Non-duplicate reads",  "Unmapped reads", "Mapped reads")

# 6 x 11 was a good ratio, but the text was too small

read_type_fractions_long_for_histogram <- read_type_fractions_long %>%
  mutate(read_type_fraction_name = factor(read_type_fraction_name,
                                          levels = stat_names))

read_type_fractions_histogram <-  ggplot(read_type_fractions_long_for_histogram) +
  # geom_histogram(aes(x = dataset_value, fill = read_type_fraction_name))  +
    geom_histogram(aes(x = dataset_value, color = read_type_fraction_name), 
                   fill = "white", stat = StatBin2)  +
  # scale_fill_brewer(palette = "Set1") +
  scale_color_manual(values = colors_for_stat_names) +
  facet_wrap(~read_type_fraction_name, 
             nrow = 1, 
             scales = "free_y",
             labeller = labeller(
               read_type_fraction_name = stat_label
               )
             ) +
  ylab("Datasets") +
  xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_histogram$dataset_id)),  " datasets")) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
  #  expand_limits(x = c(-0.25, 0.9)) +
#  ggtitle(plot_title) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
#  theme_linedraw(base_size = 20) +
    theme(strip.text.x = element_text(size = 12, face = "bold")) +
    theme(legend.position = "none")

read_type_fractions_histogram
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Statements about read fractions

# 75% of  datasets have fewer than 7% unmapped reads 
read_counts_with_read_type_fractions
## # A tibble: 2,178 x 26
## # Groups:   cohort [38]
##    dataset_id    MND   MEND Total_reads Mapped seq_length disease
##    <chr>       <dbl>  <dbl>       <dbl>  <dbl>      <dbl> <chr>  
##  1 TARGET-21… 1.75e8 7.59e7   667997032 4.59e8         75 acute …
##  2 THR33_099… 3.02e8 1.70e8   549163266 5.13e8         51 medull…
##  3 TARGET-21… 1.22e8 6.77e7   532566307 3.58e8         75 acute …
##  4 TARGET-21… 7.91e7 4.76e7   475949820 3.01e8         75 acute …
##  5 TARGET-21… 8.02e7 5.81e7   466002909 2.85e8         75 acute …
##  6 TARGET-21… 7.45e7 3.59e7   465622167 2.16e8         75 acute …
##  7 TARGET-21… 7.30e7 4.37e7   463732470 2.59e8         75 acute …
##  8 TARGET-21… 1.02e8 5.33e7   461425978 2.66e8         75 acute …
##  9 TARGET-21… 8.85e7 4.64e7   461393140 2.78e8         75 acute …
## 10 TARGET-21… 7.31e7 4.07e7   461017023 2.48e8         75 acute …
## # … with 2,168 more rows, and 19 more variables: dataset_prefix <chr>,
## #   age_at_dx <dbl>, pedaya <chr>, gender <chr>, doi_link <chr>,
## #   source_accession <chr>, repo <chr>, original_dataset_id <chr>,
## #   genes_expressed_above_0 <dbl>, genes_expressed_above_1 <dbl>,
## #   genes_expressed_above_2 <dbl>, genes_expressed_above_3 <dbl>,
## #   genes_expressed_above_4 <dbl>, genes_expressed_above_5 <dbl>,
## #   cohort <chr>, n_in_cohort <int>, pct_unmapped_of_total <dbl>,
## #   pct_dupe_of_mapped <dbl>, pct_non_exonic_of_non_dupe <dbl>
comparison_of_distributions
## # A tibble: 3 x 8
##   read_type_fraction_na… variance     min   max median    p25    p75    iqr
##   <chr>                     <dbl>   <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 pct_dupe_of_mapped      0.0589  0.0285  1.00  0.269  0.134  0.431  0.296 
## 2 pct_non_exonic_of_non…  0.0409  0.0450  0.970 0.248  0.162  0.373  0.211 
## 3 pct_unmapped_of_total   0.00606 0.00710 0.767 0.0341 0.0274 0.0601 0.0327

Calculate MEND reads as a fraction of total

 read_counts_with_MEND_fractions <- counts_and_meta  %>%
  arrange(desc(Total_reads)) %>%
  mutate(pct_MEND_of_total = MEND /Total_reads)

ggplot(read_counts_with_MEND_fractions) + 
  geom_histogram(aes(pct_MEND_of_total), stat = StatBin2) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

MEND_pct <- read_counts_with_MEND_fractions %>% 
  ungroup %>%
  mutate(dataset_value = pct_MEND_of_total) %>%
  summarize(variance= var(dataset_value),
            min = min(dataset_value),
            max = max(dataset_value),
            median = median(dataset_value),
            p25 = quantile(dataset_value, 0.25),
            p75 = quantile(dataset_value, 0.75),
            iqr = IQR(dataset_value))

kable(MEND_pct %>%
        select(-variance, -p25, -p75) %>%
      mutate_if(is.double, ~100*.), digits = 0)
min max median iqr
0 79 50 31

Show per-cohort distribution of read_type_fractions

figure_name <- "fracs_by_group"

read_type_fractions_long_for_boxplot <- read_type_fractions_long_for_histogram %>%
  ungroup %>%
  mutate(boxplot_label = fct_reorder(paste0(as.character(cohort), ", n=", n_in_cohort), n_in_cohort))


read_type_fractions_per_cohort_boxplot <- ggplot(read_type_fractions_long_for_boxplot) +
  geom_boxplot(aes(x = dataset_value, y=boxplot_label), outlier.size = 0.5)  +
  facet_wrap(~read_type_fraction_name, 
             nrow = 1, 
             labeller = labeller(
               read_type_fraction_name = stat_label
             )
  ) +
  ylab("Cohorts") +
  xlab(paste0("Read type percentages in ", length(unique(read_type_fractions_long_for_boxplot$dataset_id)),  " datasets")) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.5)) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  theme(strip.text.x = element_text(size = 12, face = "bold")) +
  theme(legend.position = "none")


read_type_fractions_per_cohort_boxplot

# 
# f2ab <- plot_grid(read_type_fractions_histogram +
#             theme(strip.text.x = element_text(size = 10, face = "bold"),
#                   axis.title.x=element_blank(),
#                   axis.text.x=element_blank()),
#           read_type_fractions_per_cohort_boxplot +
#             theme(strip.text.x = element_blank(),
#                   strip.background = element_blank(),
#                   strip.text = element_blank()),
#           ncol = 1,
#           rel_heights = c(1.5, 6),
#           labels = "AUTO",
#           align = "v")

Characterize EGAD00001003279

Northcott, P.A., Buchhalter, I., Morrissy, A.S., Hovestadt, V., Weischenfeldt, J., Ehrenberger, T., Gröbner, S., Segura-Wang, M., Zichner, T., Rudneva, V.A., et al [Lichter]. (2017). The whole-genome landscape of medulloblastoma subtypes. Nature 547, 311–317.

two of the senior authors are from the German Cancer Research Center (DKFZ), one is Roland Eils German Cancer Research Center (DKFZ) Marco A. Marra from BC, Stefan M. Pfister German Cancer Research Center (DKFZ) Michael D. Taylor Sick kids Peter Lichter German Cancer Research Center (DKFZ)

RNA-Seq usage 1) CESAM integrates structural variant-derived breakpoints with RNA-seq data to identify expression changes associated with breakpoints in cis as described in previously37, by performing linear regression of expression (molecular phenotype) on structural variant-derived breakpoint (somatic genotype) data.

  1. Expression, e.g. d, t-SNE plots showing the relative, normalized expression intensities of GFI1, GFI1B, MYC, MYCN and PRDM6 in methylation subtypes (n = 219). e, Expression heat map showing transcriptional diversity among new MB subtypes (n = 248). (but could be from Affy)

Transcriptome data were acquired through RNA sequencing (RNA-seq; n = 164) and Affymetrix expression arrays (n = 392).

https://www.nature.com/articles/nature22973

read_type_names <- c("Total_reads", "Mapped",  "MND", "MEND")
this_cohort <- "EGAD00001003279"

in MS

 read_counts_with_read_type_fractions %>%
  filter(cohort == this_cohort) %>%
   mutate(gt_98 = pct_dupe_of_mapped > 0.98) %>%
   tabyl(gt_98) %>%
   add_totals_row
## Warning: 'add_totals_row' is deprecated.
## Use 'adorn_totals("row")' instead.
## See help("Deprecated")
##  gt_98   n   percent
##  FALSE  55 0.4330709
##   TRUE  72 0.5669291
##  Total 127 1.0000000
read_counts_with_read_type_fractions %>%
  filter(cohort == this_cohort) %>%
  filter(pct_dupe_of_mapped > 0.98) %>%
  pull(Total_reads) %>% min
## [1] 178294341

other

 counts_and_meta_long <- counts_and_meta  %>%
  pivot_longer(cols = c(Total_reads, Mapped, MEND, MND), names_to = "read_type_name", values_to = "dataset_value")  %>%
   ungroup %>%
   mutate(read_type_name = factor(read_type_name, levels = read_type_names))

# read_label <- c("% unmapped", "% duplicate \n(of mapped)", "% non-exonic \n(of non-duplicate)")
# names(read_label) <- read_type_names

counts_and_meta_long_one_proj <-  counts_and_meta_long %>%
  filter(cohort == this_cohort)

table(counts_and_meta_long_one_proj$read_type_name)
## 
## Total_reads      Mapped         MND        MEND 
##         127         127         127         127
# 78 datasets in the cohort have fewer than 0.2 million MEND reads. 
sum((counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND"))$dataset_value < 200000)
## [1] 78
those_datasets <- counts_and_meta_long_one_proj %>% filter(read_type_name=="MEND", dataset_value < 200000) %>% pull(dataset_id)


summary((counts_and_meta_long_one_proj %>% filter(read_type_name=="Total_reads", dataset_id %in% those_datasets))$dataset_value)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 178294341 199916664 208667776 211606891 223832191 247618832
max_plots_to_include <- 10
n_datasets_to_plot <- ifelse(n_distinct(counts_and_meta_long_one_proj$dataset_id)>max_plots_to_include, 
                         max_plots_to_include, 
                         n_distinct(counts_and_meta_long_one_proj$dataset_id))
plot_word <- ifelse(n_datasets_to_plot==max_plots_to_include, max_plots_to_include, paste("all", n_distinct(counts_and_meta_long_one_proj$dataset_id)))

set.seed(100)
datasets_to_plot <- sample(unique(counts_and_meta_long_one_proj$dataset_id), n_datasets_to_plot)

ggplot(subset(counts_and_meta_long_one_proj, dataset_id %in% datasets_to_plot)) + 
  geom_col(aes(x=read_type_name, y=dataset_value/1e6, fill = read_type_name)) +
  facet_wrap(~dataset_id, ncol = 1, strip.position="right")  +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  theme(strip.text.y = element_text(angle = 0)) +
  scale_fill_brewer(palette = "Set1")  +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle(paste("Read counts for", plot_word, "datasets from", this_cohort)) +
  coord_flip()

ggplot(counts_and_meta_long_one_proj %>% filter(read_type_name == "MEND")) + 
  geom_histogram(aes(dataset_value/1e6), stat = StatBin2) +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  ggtitle(paste("Distribution of MEND counts in", this_cohort))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

bl <- colorRampPalette(c("navy","royalblue","lightskyblue"))(200)                      
re <- colorRampPalette(c("mistyrose", "red2","darkred"))(200)


ggplot(counts_and_meta %>% 
         filter(cohort == this_cohort) %>%
         mutate(pct_dupes = (Mapped - MND)/Mapped)) + 
  geom_point(aes(x=MEND/1e6, y=pct_dupes, fill = Total_reads/1e6), shape=21, color = "black") +
    theme_minimal(base_size = 20) +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  # scale_color_gradientn() +
  scale_fill_gradientn(colours = c(bl,"white", re)) +
  ggtitle(paste("Relationship of dupe fraction to MEND count in", this_cohort), "For detecting whether datasets with less info were sequenced more deeply. \nNormally, higher depth datasets have more MEND reads and more duplicate \nreads than the same dataset sequenced at a lower total depth.")

counts_and_meta_long_one_proj %>%
  select(seq_length, dataset_id) %>%
  distinct %>%
  tabyl(seq_length)
##  seq_length   n percent
##          51 127       1

Expressed genes

What is the distribution of measured genes?

gene_counts_long <- counts_and_meta %>%
  pivot_longer(cols = starts_with("genes_"), names_to = "Expression_threshold")  %>%
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  


ggplot(gene_counts_long) + 
  geom_histogram(aes(x=value/1e3, fill = Expression_threshold), stat = StatBin) +
  facet_wrap(~Expression_threshold)  +
    theme_minimal() +
  scale_fill_brewer(palette = "Set1") +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
#  theme_linedraw(base_size = 20) +
    theme(legend.position = "none") +
  # scale_fill_brewer(palette = "Set1") +
  ylab("Datasets") +
  xlab("Measured genes") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What are the exact lowest numbers?

sort(counts_and_meta$genes_expressed_above_0) [1:100]
##   [1]     9     9    12    12    13    14    14    15    16    16    17
##  [12]    18    18    18    18    18    18    19    19    19    19    20
##  [23]    20    20    20    20    20    21    21    21    21    21    21
##  [34]    22    22    22    23    23    23    23    23    24    24    24
##  [45]    24    24    24    24    25    25    25    26    26    26    26
##  [56]    27    27    27    27    28    28    28    28    29    29    29
##  [67]    29    29    30    30    31    31    32    33    35    36    37
##  [78]    40  6098  7107 11830 15323 16793 16837 16943 17233 17531 18288
##  [89] 18588 18704 19626 19693 19749 19781 19942 19963 20063 20120 20203
## [100] 20247

correlations with all datasets

library(corrr)


counts_and_meta %>% 
    ungroup %>% 
    select_if(is.double) %>%
       correlate() %>%
  filter(rowname %in% read_type_names) %>%
  rename(Read_type_count=rowname) %>% 
  mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
  arrange(Read_type_count) %>%
  select(c(Read_type_count, starts_with("genes_expressed"))) %>%
  kable(caption = "Correlations with all datasets", digits = 2)
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all datasets
Read_type_count genes_expressed_above_0 genes_expressed_above_1 genes_expressed_above_2 genes_expressed_above_3 genes_expressed_above_4 genes_expressed_above_5
Total_reads -0.20 -0.40 -0.40 -0.40 -0.39 -0.39
Mapped -0.19 -0.41 -0.41 -0.41 -0.40 -0.39
MND 0.51 0.24 0.20 0.18 0.17 0.16
MEND 0.48 0.27 0.26 0.26 0.26 0.26

correlations with all but low gene count datasets

counts_and_meta %>% 
  filter(genes_expressed_above_0 > min_genes_gt_0) %>%
    ungroup %>% 
    select_if(is.double) %>%
       correlate() %>%
  filter(rowname %in% read_type_names) %>%
  rename(Read_type_count=rowname) %>% 
  mutate(Read_type_count = factor(Read_type_count, levels = read_type_names)) %>%
  arrange(Read_type_count) %>%
  select(c(Read_type_count, starts_with("genes_expressed"))) %>%
  kable(caption = "Correlations with all but low gene count datasets", digits = 2)
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
Correlations with all but low gene count datasets
Read_type_count genes_expressed_above_0 genes_expressed_above_1 genes_expressed_above_2 genes_expressed_above_3 genes_expressed_above_4 genes_expressed_above_5
Total_reads 0.13 -0.26 -0.28 -0.28 -0.28 -0.28
Mapped 0.21 -0.25 -0.26 -0.27 -0.27 -0.27
MND 0.51 0.04 0.02 0.00 0.00 0.00
MEND 0.44 0.08 0.09 0.10 0.11 0.12

Plot correlations betweeen gene counts and read types

Groom data

expr_gene_and_read_counts_to_plot <- counts_and_meta %>%
  pivot_longer (cols = c(MND, MEND, Total_reads, Mapped), names_to = "Read_type", values_to = "Read_count") %>%
  pivot_longer (cols = starts_with("genes_expressed"), names_to = "Expression_threshold", values_to = "Gene_count") %>%
  mutate(Read_type = factor(Read_type, levels = read_type_names))

Identify datasets with unusual gene counts or read depths

datasets_with_normal_expressed_gene_numbers <- counts_and_meta %>%
  filter(genes_expressed_above_0 > min_genes_gt_0) %>%
  pull(dataset_id)

datasets_with_outlier_gene_counts <- counts_and_meta %>%
  ungroup %>%
  filter(is_outlier(genes_expressed_above_0)) %>%
  pull(dataset_id) 

datasets_with_outlier_depths <- counts_and_meta %>%
  ungroup %>%
  filter(is_outlier(Total_reads)) %>%
  pull(dataset_id) 

Function for calculating gene count correlations and plotting results

plot_gene_count_correlations <- function(this_data = expr_gene_and_read_counts_to_plot, this_plot_title = "gene count correlations"){

this_data <- this_data %>% 
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  
  
  
these_stats <- this_data  %>%
group_by(Read_type, Expression_threshold) %>%
  summarize(r_corr = round(cor(Gene_count, Read_count), 2)) 

# unique(this_data$Read_type)
# dput(unique(this_data$Read_type))
# these_colors_with_MEND

colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00", 
"MND"="#E31A1C", "MEND"="#000000")

ggplot(this_data,
       aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black") +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Read_count/1E6),
             y=max(this_data$Gene_count/1e3),
             hjust = 1, vjust = 1
             ) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5)) +
  scale_color_manual(values = colors_for_correlation_plot) +
  facet_grid(Read_type~Expression_threshold) +
  ggtitle(this_plot_title)  + 
  theme(legend.position="none") +
  xlab("Read counts (million)") +
  ylab("Gene counts (thousand)")

}

for all datasets

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot,
                             "Correlations calculated across all datasets")
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(dataset_id %in% datasets_with_normal_expressed_gene_numbers),
  paste("Correlations in datasets with more than", min_genes_gt_0, "genes expressed above 0"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_gene_counts),
  paste("Correlations calculated across all but outlier gene count datasets"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_depths),
  paste("Correlations calculated across all but outlier read depth datasets"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(expr_gene_and_read_counts_to_plot %>% 
  filter(! dataset_id %in% datasets_with_outlier_depths, 
         ! dataset_id %in% datasets_with_outlier_gene_counts
         ),
  paste("Correlations calculated across all but datasets with outlier read depth or gene count"))
## `geom_smooth()` using formula 'y ~ x'

plot_gene_count_correlations(
  expr_gene_and_read_counts_to_plot %>% 
    filter(disease ==  "acute lymphoblastic leukemia"),
  "Correlations calculated among acute lymphoblastic leukemia datasets")
## `geom_smooth()` using formula 'y ~ x'

corr plot for figure

max_total_read_depth <- 250*1e6


sum(counts_and_meta$genes_expressed_above_0 <= min_genes_gt_0)
## [1] 78
sum(counts_and_meta$Total_reads > max_total_read_depth)
## [1] 105
# min_genes_gt_0
exclude_for_depth <- counts_and_meta %>%
  filter(Total_reads > max_total_read_depth) %>%
  pull(dataset_id)

exclude_for_gene_count <- counts_and_meta %>%
  filter(genes_expressed_above_0 < min_genes_gt_0) %>%
  pull(dataset_id)


this_data <- expr_gene_and_read_counts_to_plot %>% 
      filter(
        Expression_threshold %in% paste0("genes_expressed_above_", 1:3),
       ! dataset_id %in% exclude_for_depth, 
        ! dataset_id %in% exclude_for_gene_count
      ) %>%
  mutate(Expression_threshold = paste(str_replace(Expression_threshold, "genes_expressed_above_", "genes > "), "TPM"))  

n_distinct(this_data$dataset_id)
## [1] 1995
this_plot_title <- paste("Correlations calculated across all but datasets with excessive read depth or low gene count")

these_stats <- this_data  %>%
group_by(Read_type, Expression_threshold) %>%
  summarize(r_corr = round(cor(Gene_count, Read_count), 2))

colors_for_correlation_plot <- c("Total_reads"="#1F78B4", "Mapped"="#FF7F00", 
"MND"="#E31A1C", "MEND"="#000000")

cor_plots_excluding_high_read_depth_and_low_gene_count_datasets <- ggplot(this_data,
       aes(x=Read_count/1E6, y=Gene_count/1e3, color = Read_type)) + 
  geom_point(alpha = 0.5, shape = 20, size =0.1) +
  geom_smooth(method = "lm", color = "black") +
  geom_label(data = these_stats, 
             aes(label=paste0("r=", r_corr)), 
             x=max(this_data$Read_count/1E6),
             y=max(this_data$Gene_count/1e3),
             hjust = 1, vjust = 1
             ) +
    theme_minimal() +
    theme(axis.line.x = element_line(color="darkgrey", size = 0.5),
        axis.line.y = element_line(color="darkgrey", size = 0.5),
        strip.text = element_text(face = "bold")) +
  scale_color_manual(values = colors_for_correlation_plot) +
  facet_grid(Read_type~Expression_threshold) +
#  ggtitle(this_plot_title)  + 
  theme(legend.position="none") +
  scale_x_continuous("Read counts (million)", n.breaks = 4) +
  ylab("Gene counts (thousand)") 


cor_plots_excluding_high_read_depth_and_low_gene_count_datasets
## `geom_smooth()` using formula 'y ~ x'

# Fig 2

f2a <- read_type_fractions_histogram +
            theme(strip.text.x = element_text(size = 10, face = "bold"),
                  axis.title.x=element_blank(),
                  axis.text.x=element_blank())

f2b <- read_type_fractions_per_cohort_boxplot +
            theme(strip.text.x = element_blank(),
                  strip.background = element_blank(),
                  strip.text = element_blank())

f2ab <- plot_grid(f2a, 
                  f2b,
                  ncol = 1,
                  rel_heights = c(1.5, 6),
                  align = "v",
                  labels = "AUTO")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
f2c <- plot_grid(cor_plots_excluding_high_read_depth_and_low_gene_count_datasets,
          labels = "C")
## `geom_smooth()` using formula 'y ~ x'
f2 <- plot_grid(f2ab,
                 f2c,
                 nrow=1,
                 rel_widths = c(4,2.3))

figure_name <- "f2"
f2 +   draw_label(paste(figure_name, Sys.time()),
             x = 0, y = 0.02, hjust = 0, size = 6) 

Plot sequence length against pct duplicates

ggplot(read_counts_with_read_type_fractions) + 
  geom_point(aes(x=seq_length, y=pct_dupe_of_mapped))
## Warning: Removed 245 rows containing missing values (geom_point).

ggplot(read_counts_with_read_type_fractions) + 
  geom_boxplot(aes(x=seq_length, y=pct_dupe_of_mapped, group = seq_length))
## Warning: Removed 245 rows containing missing values (stat_boxplot).

Does the depth of sequencing correlate to the number of duplicates? How much variance does that explain?

cor_tot_reads_and_dupes <- cor(read_counts_with_read_type_fractions$Total_reads, 
    read_counts_with_read_type_fractions$pct_dupe_of_mapped,
    method = c("pearson"))
round(cor_tot_reads_and_dupes, 2)
## [1] 0.58
round(cor_tot_reads_and_dupes^2, 2)
## [1] 0.34

SessionInfo

pander(sessionInfo(), compact = FALSE)

R version 3.6.0 (2019-04-26)

Platform: x86_64-apple-darwin15.6.0 (64-bit)

locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8

attached base packages:

  • stats
  • graphics
  • grDevices
  • utils
  • datasets
  • methods
  • base

other attached packages:

  • corrr(v.0.4.0)
  • kableExtra(v.1.1.0)
  • pander(v.0.6.3)
  • RColorBrewer(v.1.1-2)
  • cowplot(v.1.0.0)
  • ggpubr(v.0.2.5)
  • magrittr(v.1.5)
  • ggthemes(v.4.2.0)
  • knitr(v.1.25)
  • viridis(v.0.5.1)
  • viridisLite(v.0.3.0)
  • janitor(v.1.2.0)
  • forcats(v.0.4.0)
  • stringr(v.1.4.0)
  • dplyr(v.0.8.5)
  • purrr(v.0.3.3)
  • readr(v.1.3.1)
  • tidyr(v.1.0.0)
  • tibble(v.3.0.1)
  • ggplot2(v.3.3.0)
  • tidyverse(v.1.2.1)

loaded via a namespace (and not attached):

  • Rcpp(v.1.0.3)
  • lubridate(v.1.7.4)
  • lattice(v.0.20-38)
  • assertthat(v.0.2.1)
  • digest(v.0.6.23)
  • utf8(v.1.1.4)
  • R6(v.2.4.1)
  • cellranger(v.1.1.0)
  • backports(v.1.1.5)
  • evaluate(v.0.14)
  • httr(v.1.4.1)
  • highr(v.0.8)
  • pillar(v.1.4.3)
  • rlang(v.0.4.6)
  • readxl(v.1.3.1)
  • rstudioapi(v.0.10)
  • Matrix(v.1.2-17)
  • rmarkdown(v.1.16)
  • splines(v.3.6.0)
  • labeling(v.0.3)
  • webshot(v.0.5.1)
  • munsell(v.0.5.0)
  • broom(v.0.5.2)
  • compiler(v.3.6.0)
  • modelr(v.0.1.5)
  • xfun(v.0.10)
  • pkgconfig(v.2.0.3)
  • mgcv(v.1.8-29)
  • htmltools(v.0.4.0)
  • tidyselect(v.1.1.0)
  • gridExtra(v.2.3)
  • fansi(v.0.4.0)
  • crayon(v.1.3.4)
  • withr(v.2.1.2)
  • grid(v.3.6.0)
  • nlme(v.3.1-141)
  • jsonlite(v.1.6)
  • gtable(v.0.3.0)
  • lifecycle(v.0.2.0)
  • scales(v.1.1.0)
  • cli(v.2.0.0)
  • stringi(v.1.4.3)
  • farver(v.2.0.1)
  • ggsignif(v.0.6.0)
  • xml2(v.1.2.2)
  • ellipsis(v.0.3.0)
  • generics(v.0.0.2)
  • vctrs(v.0.3.0)
  • tools(v.3.6.0)
  • glue(v.1.4.1)
  • hms(v.0.5.1)
  • yaml(v.2.2.0)
  • colorspace(v.1.4-1)
  • rvest(v.0.3.4)
  • haven(v.2.1.1)